{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 信用卡欺诈检测\n",
    "### 导入库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from imblearn.ensemble import BalancedBaggingClassifier,BalancedRandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the data\n",
    "data = pd.read_csv(\"./creditcard.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据预处理\n",
    "#### 查看数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Time</th>\n",
       "      <th>V1</th>\n",
       "      <th>V2</th>\n",
       "      <th>V3</th>\n",
       "      <th>V4</th>\n",
       "      <th>V5</th>\n",
       "      <th>V6</th>\n",
       "      <th>V7</th>\n",
       "      <th>V8</th>\n",
       "      <th>V9</th>\n",
       "      <th>...</th>\n",
       "      <th>V21</th>\n",
       "      <th>V22</th>\n",
       "      <th>V23</th>\n",
       "      <th>V24</th>\n",
       "      <th>V25</th>\n",
       "      <th>V26</th>\n",
       "      <th>V27</th>\n",
       "      <th>V28</th>\n",
       "      <th>Amount</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.359807</td>\n",
       "      <td>-0.072781</td>\n",
       "      <td>2.536347</td>\n",
       "      <td>1.378155</td>\n",
       "      <td>-0.338321</td>\n",
       "      <td>0.462388</td>\n",
       "      <td>0.239599</td>\n",
       "      <td>0.098698</td>\n",
       "      <td>0.363787</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.018307</td>\n",
       "      <td>0.277838</td>\n",
       "      <td>-0.110474</td>\n",
       "      <td>0.066928</td>\n",
       "      <td>0.128539</td>\n",
       "      <td>-0.189115</td>\n",
       "      <td>0.133558</td>\n",
       "      <td>-0.021053</td>\n",
       "      <td>149.62</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.191857</td>\n",
       "      <td>0.266151</td>\n",
       "      <td>0.166480</td>\n",
       "      <td>0.448154</td>\n",
       "      <td>0.060018</td>\n",
       "      <td>-0.082361</td>\n",
       "      <td>-0.078803</td>\n",
       "      <td>0.085102</td>\n",
       "      <td>-0.255425</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.225775</td>\n",
       "      <td>-0.638672</td>\n",
       "      <td>0.101288</td>\n",
       "      <td>-0.339846</td>\n",
       "      <td>0.167170</td>\n",
       "      <td>0.125895</td>\n",
       "      <td>-0.008983</td>\n",
       "      <td>0.014724</td>\n",
       "      <td>2.69</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.358354</td>\n",
       "      <td>-1.340163</td>\n",
       "      <td>1.773209</td>\n",
       "      <td>0.379780</td>\n",
       "      <td>-0.503198</td>\n",
       "      <td>1.800499</td>\n",
       "      <td>0.791461</td>\n",
       "      <td>0.247676</td>\n",
       "      <td>-1.514654</td>\n",
       "      <td>...</td>\n",
       "      <td>0.247998</td>\n",
       "      <td>0.771679</td>\n",
       "      <td>0.909412</td>\n",
       "      <td>-0.689281</td>\n",
       "      <td>-0.327642</td>\n",
       "      <td>-0.139097</td>\n",
       "      <td>-0.055353</td>\n",
       "      <td>-0.059752</td>\n",
       "      <td>378.66</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>-0.966272</td>\n",
       "      <td>-0.185226</td>\n",
       "      <td>1.792993</td>\n",
       "      <td>-0.863291</td>\n",
       "      <td>-0.010309</td>\n",
       "      <td>1.247203</td>\n",
       "      <td>0.237609</td>\n",
       "      <td>0.377436</td>\n",
       "      <td>-1.387024</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.108300</td>\n",
       "      <td>0.005274</td>\n",
       "      <td>-0.190321</td>\n",
       "      <td>-1.175575</td>\n",
       "      <td>0.647376</td>\n",
       "      <td>-0.221929</td>\n",
       "      <td>0.062723</td>\n",
       "      <td>0.061458</td>\n",
       "      <td>123.50</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2.0</td>\n",
       "      <td>-1.158233</td>\n",
       "      <td>0.877737</td>\n",
       "      <td>1.548718</td>\n",
       "      <td>0.403034</td>\n",
       "      <td>-0.407193</td>\n",
       "      <td>0.095921</td>\n",
       "      <td>0.592941</td>\n",
       "      <td>-0.270533</td>\n",
       "      <td>0.817739</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.009431</td>\n",
       "      <td>0.798278</td>\n",
       "      <td>-0.137458</td>\n",
       "      <td>0.141267</td>\n",
       "      <td>-0.206010</td>\n",
       "      <td>0.502292</td>\n",
       "      <td>0.219422</td>\n",
       "      <td>0.215153</td>\n",
       "      <td>69.99</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Time        V1        V2        V3        V4        V5        V6        V7  \\\n",
       "0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   \n",
       "1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   \n",
       "2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   \n",
       "3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   \n",
       "4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   \n",
       "\n",
       "         V8        V9  ...         V21       V22       V23       V24  \\\n",
       "0  0.098698  0.363787  ...   -0.018307  0.277838 -0.110474  0.066928   \n",
       "1  0.085102 -0.255425  ...   -0.225775 -0.638672  0.101288 -0.339846   \n",
       "2  0.247676 -1.514654  ...    0.247998  0.771679  0.909412 -0.689281   \n",
       "3  0.377436 -1.387024  ...   -0.108300  0.005274 -0.190321 -1.175575   \n",
       "4 -0.270533  0.817739  ...   -0.009431  0.798278 -0.137458  0.141267   \n",
       "\n",
       "        V25       V26       V27       V28  Amount  Class  \n",
       "0  0.128539 -0.189115  0.133558 -0.021053  149.62      0  \n",
       "1  0.167170  0.125895 -0.008983  0.014724    2.69      0  \n",
       "2 -0.327642 -0.139097 -0.055353 -0.059752  378.66      0  \n",
       "3  0.647376 -0.221929  0.062723  0.061458  123.50      0  \n",
       "4 -0.206010  0.502292  0.219422  0.215153   69.99      0  \n",
       "\n",
       "[5 rows x 31 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(284807, 31)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Class     0\n",
       "V14       0\n",
       "V1        0\n",
       "V2        0\n",
       "V3        0\n",
       "V4        0\n",
       "V5        0\n",
       "V6        0\n",
       "V7        0\n",
       "V8        0\n",
       "V9        0\n",
       "V10       0\n",
       "V11       0\n",
       "V12       0\n",
       "V13       0\n",
       "V15       0\n",
       "Amount    0\n",
       "V16       0\n",
       "V17       0\n",
       "V18       0\n",
       "V19       0\n",
       "V20       0\n",
       "V21       0\n",
       "V22       0\n",
       "V23       0\n",
       "V24       0\n",
       "V25       0\n",
       "V26       0\n",
       "V27       0\n",
       "V28       0\n",
       "Time      0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull().sum().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',\n",
       "       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18',\n",
       "       'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27',\n",
       "       'V28', 'Amount', 'Class'], dtype=object)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "colNames = data.columns.values\n",
    "colNames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    284315\n",
      "1       492\n",
      "Name: Class, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "count_classes = pd.value_counts(data['Class'], sort = True).sort_index()\n",
    "print(count_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop('Time',axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征工程（特征、标签提取）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data.iloc[:, data.columns != 'Class']\n",
    "y = data.iloc[:, data.columns == 'Class']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "rus = RandomUnderSampler(random_state=0, replacement=True)\n",
    "X_resampled, y_resampled = rus.fit_sample(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/python/lib/python3.5/site-packages/sklearn/ensemble/bagging.py:621: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.977.\n",
      "precision/recall/f1-score:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       1.00      0.98      0.99     71082\n",
      "           1       0.06      0.86      0.11       120\n",
      "\n",
      "   micro avg       0.98      0.98      0.98     71202\n",
      "   macro avg       0.53      0.92      0.55     71202\n",
      "weighted avg       1.00      0.98      0.99     71202\n",
      "\n",
      "confusion_matrix:\n",
      "[[69477  1605]\n",
      " [   17   103]]\n"
     ]
    }
   ],
   "source": [
    "bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),\n",
    "                                ratio='auto',\n",
    "                                replacement=False,\n",
    "                                random_state=0)\n",
    "bbc.fit(X_train, y_train) \n",
    "\n",
    "y_pred = bbc.predict(X_test)\n",
    "print ('Accuracy: %.3f.' % accuracy_score(y_test,y_pred))\n",
    "print ('precision/recall/f1-score:')\n",
    "print (classification_report(y_test,y_pred))\n",
    "print ('confusion_matrix:')\n",
    "print (confusion_matrix(y_test,y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./bbc_classification.model']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.externals import joblib\n",
    "bbc_model_path = \"./bbc_classification.model\"\n",
    "#保存训练好的bbc模型\n",
    "joblib.dump(bbc, bbc_model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 加载训练好的xgboost模型\n",
    "model = joblib.load(bbc_model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.977.\n",
      "precision/recall/f1-score:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       1.00      0.98      0.99     71082\n",
      "           1       0.06      0.86      0.11       120\n",
      "\n",
      "   micro avg       0.98      0.98      0.98     71202\n",
      "   macro avg       0.53      0.92      0.55     71202\n",
      "weighted avg       1.00      0.98      0.99     71202\n",
      "\n",
      "confusion_matrix:\n",
      "[[69477  1605]\n",
      " [   17   103]]\n"
     ]
    }
   ],
   "source": [
    "y_pred = model.predict(X_test)\n",
    "print ('Accuracy: %.3f.' % accuracy_score(y_test,y_pred))\n",
    "print ('precision/recall/f1-score:')\n",
    "print (classification_report(y_test,y_pred))\n",
    "print ('confusion_matrix:')\n",
    "print (confusion_matrix(y_test,y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/python/lib/python3.5/site-packages/ipykernel_launcher.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.979.\n",
      "precision/recall/f1-score:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       1.00      0.98      0.99     71082\n",
      "           1       0.07      0.90      0.13       120\n",
      "\n",
      "   micro avg       0.98      0.98      0.98     71202\n",
      "   macro avg       0.53      0.94      0.56     71202\n",
      "weighted avg       1.00      0.98      0.99     71202\n",
      "\n",
      "confusion_matrix:\n",
      "[[69626  1456]\n",
      " [   12   108]]\n"
     ]
    }
   ],
   "source": [
    "model = BalancedRandomForestClassifier(n_jobs=2, random_state=1)\n",
    "model.fit(X_train, y_train) \n",
    "\n",
    "y_pred = model.predict(X_test)\n",
    "print ('Accuracy: %.3f.' % accuracy_score(y_test,y_pred))\n",
    "print ('precision/recall/f1-score:')\n",
    "print (classification_report(y_test,y_pred))\n",
    "print ('confusion_matrix:')\n",
    "print (confusion_matrix(y_test,y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
